This is an R Markdown document containing code and plots for Abumazen et al. “Nasopharyngeal metatranscriptomics reveals host-pathogen signatures of pediatric sinusitis”.

Loading required R packages

library(reshape2)
library(pheatmap)
library(pROC)
library(RColorBrewer)
library(dplyr)
library(plyr)
library(vioplot)
library(Metrics)
library(DESeq2)
library(tximport)
library(EnhancedVolcano)
library(patchwork)
library(knitr)
library(vegan)
library(ggplot2)

1. Kraken2-based pathogen detection from RNA-seq data

Loading data files


Reading in the kraken2 data

tb = read.delim("d1-d5_fastp_dehost_krakenmerged_reads_normalizedPlus14PosControls.txt",sep='\t',col.names=(c("X","Y","Z")))

matr = dcast(tb,Y ~ X,value.var = "Z")
rownames(matr) = matr[,1]
matr = matr[,-1]

for (i in 1:nrow(matr))
{   matr[i,which(is.na(matr[i,]))] = 0
}

matr = t(data.matrix(matr))

splitrownames = vector(length = nrow(matr))
for (i in 1:length(splitrownames))
{
    rownames(matr)[i] = strsplit(rownames(matr),"_")[[i]][1]
}

Reading in the metadata

metadata = read.csv("metadata.11.26.24.csv",header=T)
metadata = metadata[which(metadata$batch != 1),] #removing samples from batch 1

shared = intersect(rownames(matr),metadata[,1])
matr = matr[match(shared,rownames(matr)),]
metadata = metadata[match(shared,metadata[,1]),]

Diversity analysis and ordinations

Computing NMDS ordinations

# Step 1: Compute Bray-Curtis Dissimilarity Matrix
beta_dissimilarity <- vegdist(matr, method = "bray")

# Step 2: Perform PCoA
pcoa_result <- cmdscale(beta_dissimilarity, k = 2, eig = TRUE)  # k = 2 for 2D

# Step 3: Prepare Data for ggplot
pcoa_data <- data.frame(
  PCoA1 = pcoa_result$points[, 1],
  PCoA2 = pcoa_result$points[, 2],
  bv_both_none = metadata$bv_both_none
)

# Step 4: Plot PCoA Results
ggplot(pcoa_data, aes(x = PCoA1, y = PCoA2, color = factor(bv_both_none))) +
  geom_point(size = 3) +
  labs(
    title = "PCoA Ordination (Bray-Curtis)",
    x = "PCoA Dimension 1",
    y = "PCoA Dimension 2",
    color = "BV Category"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5),
    legend.title = element_text(size = 12)
  )

Comparing alpha-diversity across groups

# Compute alpha diversity metrics
alpha_diversity <- data.frame(
  Shannon = diversity(matr, index = "shannon"),  # Shannon index
  Simpson = diversity(matr, index = "simpson"),  # Simpson index
  Richness = rowSums(matr > 0),  # Richness (number of non-zero species)
  bv_both_none = metadata$bv_both_none  # Add grouping information
)

# Ensure bv_both_none is treated as a factor
alpha_diversity$bv_both_none <- as.factor(alpha_diversity$bv_both_none)

# Plot alpha diversity (Shannon index) by group
ggplot(alpha_diversity, aes(x = bv_both_none, y = Shannon, fill = bv_both_none)) +
  geom_boxplot() +
  labs(
    title = "Alpha Diversity (Shannon Index)",
    x = "BV Category",
    y = "Shannon Diversity"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5),
    legend.position = "none"
  )

Statistical comparisons

# Perform Kruskal-Wallis test
kruskal_result <- kruskal.test(Shannon ~ bv_both_none, data = alpha_diversity)
print(kruskal_result)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  Shannon by bv_both_none
## Kruskal-Wallis chi-squared = 7.0048, df = 4, p-value = 0.1356
# Post-hoc pairwise comparisons (Dunn's test)
library(FSA)  # For Dunn's test
dunn_result <- dunnTest(Shannon ~ bv_both_none, data = alpha_diversity, method = "bh")
print(dunn_result)
##    Comparison          Z    P.unadj     P.adj
## 1       0 - 1  0.9040555 0.36596600 0.5228086
## 2       0 - 2  2.0119384 0.04422643 0.2211321
## 3       1 - 2  1.3027301 0.19266692 0.3853338
## 4       0 - 3  2.0429169 0.04106066 0.4106066
## 5       1 - 3  1.2362064 0.21638186 0.3606364
## 6       2 - 3 -0.4341663 0.66416772 0.6641677
## 7       0 - 4  1.9575507 0.05028276 0.1676092
## 8       1 - 4  1.4140971 0.15733339 0.3933335
## 9       2 - 4  0.5438673 0.58653282 0.6517031
## 10      3 - 4  0.8494120 0.39565209 0.4945651
 detach("package:FSA", unload = TRUE)

Now, comparing between group 0 (no pathogen detected) and groups 1-3 (pathogens detected)

# Create a binary grouping variable
alpha_diversity$group_binary <- ifelse(alpha_diversity$bv_both_none == 0, "Group 0", "Groups 1-3")

wilcox_result <- wilcox.test(Shannon ~ group_binary, data = alpha_diversity)
print(wilcox_result)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  Shannon by group_binary
## W = 2570, p-value = 0.04197
## alternative hypothesis: true location shift is not equal to 0
# Boxplot with binary groups
ggplot(alpha_diversity, aes(x = group_binary, y = Shannon, fill = group_binary)) +
  geom_boxplot() +
  labs(
    title = "Comparison of Group 0 vs. Groups 1-3",
    x = "Group",
    y = "Shannon Diversity"
  ) +
  theme_minimal() +
  theme(
    legend.position = "none",
    plot.title = element_text(hjust = 0.5)
  )

Detection of indicator species through differential species abundance analysis

group0 = which(metadata$bv_both_none == 0)
group1 = which(metadata$bv_both_none == 1)
group2 = which(metadata$bv_both_none == 2)
group3 = which(metadata$bv_both_none == 3)

# group1 (bacterial) vs group0 (none) comparison


pvals = vector(length = ncol(matr))
fc = vector(length = ncol(matr))

for (i in 1:ncol(matr))
{
  pvals[i] = wilcox.test(matr[group1,i],matr[group0,i])$p.value
  fc[i] = log((mean(matr[group1,i]) + 1)/(mean(matr[group0,i]) + 1),base=2)
}


cols = vector(length = ncol(matr))
cols[] = "gray"
cols[which(colnames(matr) == "Haemophilus influenzae")] = "green"
cols[which(colnames(matr) == "Moraxella catarrhalis")] = "red"
cols[which(colnames(matr) == "Streptococcus pneumoniae")] = "purple"

pvals[which(is.nan(pvals))] = 1


plot(fc,-log(pvals,10),col=alpha(cols,0.5),pch=19)

colnames(matr)[intersect(which(abs(fc) >= 1),which(pvals <= 0.05))]
## [1] "[Haemophilus] ducreyi"     "Haemophilus haemolyticus" 
## [3] "Haemophilus influenzae"    "Moraxella catarrhalis"    
## [5] "Moraxella nonliquefaciens" "Streptococcus pneumoniae"
# group2 (viral) vs group0 (none) comparison


pvals = vector(length = ncol(matr))
fc = vector(length = ncol(matr))

for (i in 1:ncol(matr))
{
  pvals[i] = wilcox.test(matr[group2,i],matr[group0,i])$p.value
  fc[i] = log((mean(matr[group2,i]) + 1)/(mean(matr[group0,i]) + 1),base=2)
}


cols = vector(length = ncol(matr))
cols[] = "gray"
cols[which(colnames(matr) == "Influenza A virus")] = "blue"
cols[which(colnames(matr) == "Rhinovirus A")] = "orange"


pvals[which(is.nan(pvals))] = 1


plot(fc,-log(pvals,10),col=alpha(cols,0.5),pch=19)

colnames(matr)[intersect(which(abs(fc) >= 1),which(pvals <= 0.05))]
## [1] "Influenza A virus" "Rhinovirus A"
# group3 (viral) vs group0 (none) comparison


pvals = vector(length = ncol(matr))
fc = vector(length = ncol(matr))

for (i in 1:ncol(matr))
{
  pvals[i] = wilcox.test(matr[group3,i],matr[group0,i])$p.value
  fc[i] = log((mean(matr[group3,i]) + 1)/(mean(matr[group0,i]) + 1),base=2)
}


cols = vector(length = ncol(matr))
cols[] = "gray"

cols[which(colnames(matr) == "Haemophilus influenzae")] = "green"
cols[which(colnames(matr) == "Moraxella catarrhalis")] = "red"
cols[which(colnames(matr) == "Streptococcus pneumoniae")] = "purple"


cols[which(colnames(matr) == "Influenza A virus")] = "blue"
cols[which(colnames(matr) == "Rhinovirus A")] = "orange"



pvals[which(is.nan(pvals))] = 1


plot(fc,-log(pvals,10),col=alpha(cols,0.5),pch=19)

colnames(matr)[intersect(which(abs(fc) >= 1),which(pvals <= 0.05))]
##  [1] "Aggregatibacter actinomycetemcomitans"
##  [2] "Giardia intestinalis"                 
##  [3] "Haemophilus haemolyticus"             
##  [4] "Haemophilus influenzae"               
##  [5] "Haemophilus parahaemolyticus"         
##  [6] "Haemophilus parainfluenzae"           
##  [7] "Moraxella catarrhalis"                
##  [8] "Rhinovirus A"                         
##  [9] "Rhodopseudomonas palustris"           
## [10] "Streptococcus pneumoniae"             
## [11] "Treponema denticola"

Bacterial predictions


Heatmap

annotation_row = metadata[,c(22,24,26)]
annotation_row = (annotation_row -2) * -1   #just to make it going 0 to 1
rownames(annotation_row) = metadata[,1]
colnames(annotation_row) = c("SPN", "HFLU", "MCAT")


cols = c("white",colorRampPalette((brewer.pal(n = 7, name =
  "YlGnBu")))(99))

ann_colors = list(
    SPN = c("white", "light gray"),
    HFLU = c("white", "light gray"),
    MCAT = c("white", "light gray")
)


pheatmap(log(matr[,c(grep("Moraxella cat", colnames(matr)),grep("Streptococcus pne", colnames(matr)),grep("Haemophilus inf", colnames(matr)))] + 1, 10),annotation_row=annotation_row[,c(2,1,3)],cluster_cols = F, show_rownames = FALSE,col=cols,annotation_colors = ann_colors)

Box Plots

metadata[which(is.na(metadata[,"mcat"])),"mcat"] = "TrueNEG"

group = metadata[,"mcat"]

group <- mapvalues(group, 
          from=c("TrueNEG","2","1"), 
          to=c("TrueNEG","-","+"))
    group = as.factor(group)

group <- factor(group, levels = c("TrueNEG", "-", "+"))  # Set the desired order

    
abund = log(matr[,"Moraxella catarrhalis"] + 1,10)
  p = ggplot(data.frame(mcat = abund), aes(y = abund, x = group,fill=as.factor(group))) +
  geom_boxplot(outlier.shape=NA) +
  theme(legend.position = "none") +
    ggtitle("mcat") +    theme(plot.title = element_text(hjust = 0.5)) +
  labs(y = "RNA-seq abundance\nlog10(rpm)", x = "Culture test") +
  geom_point(pch = 21, position = position_jitterdodge())

  p

metadata[which(is.na(metadata[,"hflu"])),"hflu"] = "TrueNEG"

group = metadata[,"hflu"]

group <- mapvalues(group, 
          from=c("TrueNEG","2","1"), 
          to=c("TrueNEG","-","+"))
    group = as.factor(group)

group <- factor(group, levels = c("TrueNEG", "-", "+"))  # Set the desired order
    
abund = log(matr[,"Haemophilus influenzae"] + 1,10)
  p = ggplot(data.frame(hflu = abund), aes(y = abund, x = group,fill=as.factor(group))) +
  geom_boxplot(outlier.shape=NA) +
  theme(legend.position = "none") +
    ggtitle("hflu") +    theme(plot.title = element_text(hjust = 0.5)) +
  labs(y = "RNA-seq abundance\nlog10(rpm)", x = "Culture test") +
  geom_point(pch = 21, position = position_jitterdodge())

  p

metadata[which(is.na(metadata[,"spn"])),"spn"] = "TrueNEG"

group = metadata[,"spn"]

group <- mapvalues(group, 
          from=c("TrueNEG","2","1"), 
          to=c("TrueNEG","-","+"))
    group = as.factor(group)

group <- factor(group, levels = c("TrueNEG", "-", "+"))  # Set the desired order

abund = log(matr[,"Streptococcus pneumoniae"] + 1,10)
  p = ggplot(data.frame(spn = abund), aes(y = abund, x = group,fill=as.factor(group))) +
  geom_boxplot(outlier.shape=NA) +
  theme(legend.position = "none") +
    ggtitle("spn") +     theme(plot.title = element_text(hjust = 0.5)) +
  labs(y = "RNA-seq abundance\nlog10(rpm)", x = "Culture test") + 
  geom_point(pch = 21, position = position_jitterdodge())

  p

M. catarrhalis prediction accuracy

metadata.Valid = metadata[which(metadata[,"batch"] != 6),]
matr.Valid = matr[which(metadata[,"batch"] != 6),]

bacterial_detection_threshold = 3 # gives a good balance of sensitivity and specificity

# number of HFLU detected

hflu = which(matr.Valid[,"Haemophilus influenzae"] >= bacterial_detection_threshold)
length(hflu)
## [1] 81
# number of SPN detected
spn = which(matr.Valid[,"Streptococcus pneumoniae"] >= bacterial_detection_threshold)
length(spn)
## [1] 73
# number of MCAT detected
mcat = which(matr.Valid[,"Moraxella catarrhalis"] >= bacterial_detection_threshold)
length(mcat)
## [1] 137
#any of the above three
length(unique(c(hflu,spn,mcat)))
## [1] 177
#two or more
length(unique(c(intersect(hflu,mcat),intersect(hflu,spn),intersect(spn,mcat))))
## [1] 89
#all three
length(intersect(intersect(hflu,mcat),spn))
## [1] 25
values = unique(rev(sort(matr.Valid[,"Moraxella catarrhalis"])))
TPR = vector(length = length(values)) 
FPR = vector(length = length(values)) 
for (i in 1:length(values))

{ k = values[i] 
    matches = which(matr[,"Moraxella catarrhalis"] >= k)
    TPR[i] = length(intersect(matches,which(metadata.Valid[,"mcat"] == 1))) / length(which(metadata.Valid[,"mcat"] == 1))
    FPR[i] = length(intersect(matches,which(metadata.Valid[,"mcat"] == 2))) / length(which(metadata.Valid[,"mcat"] == 2)) 
} 


prediction = matr.Valid[,"Moraxella catarrhalis"]
response = as.numeric(metadata.Valid[,"mcat"])
response = (response -2) * -1
auc(response,prediction)
## [1] 0.823844
auc_mcat = auc(response,prediction)

plot(FPR,TPR,type="l",main=paste("mcat","AUC = ",round(auc_mcat,2)))


matches = which(matr.Valid[,"Moraxella catarrhalis"] >= bacterial_detection_threshold)
tpr0 = length(intersect(matches,which(metadata.Valid[,"mcat"] == 1))) / length(which(metadata.Valid[,"mcat"] == 1))
fpr0 = length(intersect(matches,which(metadata.Valid[,"mcat"] == 2))) / length(which(metadata.Valid[,"mcat"] == 2)) 
points(x = fpr0, y = tpr0,cex=2)

# prints out the sensitivity and the specificity given the threshold defined above
print(c("mcat",tpr0 * 100,100-(fpr0 * 100)))
## [1] "mcat"             "84.7457627118644" "64.0776699029126"
sens_mcat = tpr0 * 100
spec_mcat = 100-(fpr0 * 100)

S. pneumoniae prediction accuracy

metadata.Valid = metadata[which(metadata[,"batch"] != 6),]
matr.Valid = matr[which(metadata[,"batch"] != 6),]


values = unique(rev(sort(matr.Valid[,"Streptococcus pneumoniae"])))
TPR = vector(length = length(values)) 
FPR = vector(length = length(values)) 
for (i in 1:length(values))

{ k = values[i] 
    matches = which(matr.Valid[,"Streptococcus pneumoniae"] >= k)
    TPR[i] = length(intersect(matches,which(metadata.Valid[,"spn"] == 1))) / length(which(metadata.Valid[,"spn"] == 1))
    FPR[i] = length(intersect(matches,which(metadata.Valid[,"spn"] == 2))) / length(which(metadata.Valid[,"spn"] == 2)) 
} 

prediction = matr.Valid[,"Streptococcus pneumoniae"]
response = as.numeric(metadata.Valid[,"spn"])
response = (response -2) * -1
auc(response,prediction)
## [1] 0.8915326
auc_spn = auc(response,prediction)

plot(FPR,TPR,type="l",main=paste("spn","AUC = ",round(auc_spn,2)))

matches = which(matr.Valid[,"Streptococcus pneumoniae"] >= bacterial_detection_threshold)
tpr0 = length(intersect(matches,which(metadata.Valid[,"spn"] == 1))) / length(which(metadata.Valid[,"spn"] == 1))
fpr0 = length(intersect(matches,which(metadata.Valid[,"spn"] == 2))) / length(which(metadata.Valid[,"spn"] == 2)) 
points(x = fpr0, y = tpr0,cex=2)

# prints out the sensitivity and the specificity given the threshold defined above
print(c("spn",tpr0 * 100,100-(fpr0 * 100)))
## [1] "spn"              "81.4285714285714" "89.4039735099338"
sens_spn = tpr0 * 100
spec_spn = 100-(fpr0 * 100)

H. influenzae prediction accuracy

metadata.Valid = metadata[which(metadata[,"batch"] != 6),]
matr.Valid = matr[which(metadata[,"batch"] != 6),]


values = unique(rev(sort(matr.Valid[,"Haemophilus influenzae"])))
TPR = vector(length = length(values)) 
FPR = vector(length = length(values)) 
for (i in 1:length(values))

{ k = values[i] 
    matches = which(matr.Valid[,"Haemophilus influenzae"] >= k)
    TPR[i] = length(intersect(matches,which(metadata.Valid[,"hflu"] == 1))) / length(which(metadata.Valid[,"hflu"] == 1))
    FPR[i] = length(intersect(matches,which(metadata.Valid[,"hflu"] == 2))) / length(which(metadata.Valid[,"hflu"] == 2)) 
} 

prediction = matr.Valid[,"Haemophilus influenzae"]
response = as.numeric(metadata.Valid[,"hflu"])
response = (response -2) * -1
auc(response,prediction)
## [1] 0.9549669
auc_hflu = auc(response,prediction)

plot(FPR,TPR,type="l",main=paste("hflu","AUC = ",round(auc_hflu,2)))

matches = which(matr.Valid[,"Haemophilus influenzae"] > bacterial_detection_threshold)
tpr0 = length(intersect(matches,which(metadata.Valid[,"hflu"] == 1))) / length(which(metadata.Valid[,"hflu"] == 1))
fpr0 = length(intersect(matches,which(metadata.Valid[,"hflu"] == 2))) / length(which(metadata.Valid[,"hflu"] == 2)) 
points(x = fpr0, y = tpr0,cex=2)

# prints out the sensitivity and the specificity given the threshold defined above
print(c("hflu",tpr0 * 100,100-(fpr0 * 100)))
## [1] "hflu"             "94.2857142857143" "90.0662251655629"
sens_hflu = tpr0 * 100
spec_hflu = 100-(fpr0 * 100)


accuracies = cbind(sens = c(mcat = sens_mcat,spn = sens_spn,hflu = sens_hflu),spec = c(mcat = spec_mcat,spec_spn,spec_hflu),auc = c(auc_mcat, auc_spn, auc_hflu))

kable(accuracies)
sens spec auc
mcat 84.74576 64.07767 0.8238440
spn 81.42857 89.40397 0.8915326
hflu 94.28571 90.06623 0.9549669

Viral predictions


Heatmap

viral_reads = matrix(ncol=12,nrow = nrow(matr.Valid))

colnames(viral_reads) = c("INFA","INFB","INFC","MPV","RSV","HRV","PIV1","PIV2","PIV3","PIV4","ADV","EVD68")

rownames(viral_reads) = rownames(matr.Valid)

viral_reads[,"INFA"] = matr.Valid[,"Influenza A virus"]

viral_reads[,"INFB"] = matr.Valid[,"Influenza B virus"]

viral_reads[,"INFC"] = matr.Valid[,"Influenza C virus"]

viral_reads[,"MPV"] = matr.Valid[,"Human metapneumovirus"]

viral_reads[,"RSV"] = matr.Valid[,"Respiratory syncytial virus"] + matr.Valid[,"Human orthopneumovirus"]

viral_reads[,"HRV"] = matr.Valid[,"Rhinovirus A"] + matr.Valid[,"Rhinovirus B"] + matr.Valid[,"Rhinovirus C"]

viral_reads[,"PIV1"] = matr.Valid[,"Human respirovirus 1"]

viral_reads[,"PIV2"] = matr.Valid[,"Human orthorubulavirus 2"]

viral_reads[,"PIV3"] = matr.Valid[,"Human respirovirus 3"]

viral_reads[,"PIV4"] = matr.Valid[,"Human orthorubulavirus 4"]

viral_reads[,"ADV"] = matr.Valid[,"Human mastadenovirus B"] + matr.Valid[,"Human mastadenovirus C"]

viral_reads[,"EVD68"] = matr.Valid[,"Enterovirus D"]



#computing the number of samples containing 0, 1, 2, 3, etc. viruses
viruses_presenceAbsence = viral_reads
viruses_presenceAbsence[viruses_presenceAbsence != 0] = 1
table(apply(viruses_presenceAbsence,1,sum))
## 
##  0  1  2  3  4  5  6 
## 46 74 65 24  8  3  1
#frequency of viral detections
rev(sort(apply(viruses_presenceAbsence,2,sum)))
##   HRV   MPV  INFA  PIV4  PIV3   RSV EVD68  INFB   ADV  PIV1  INFC  PIV2 
##   100    32    29    28    28    25    23    16    15    15    11     7
tomap = log(viral_reads + 1, 10)

annotation = metadata.Valid[,c(34,32,59,57,55,53,49,63,51,46,40,36)]
rownames(annotation) = metadata.Valid[,1]
colnames(annotation) = rev(colnames(tomap))

cols = c("white",colorRampPalette((brewer.pal(n = 7, name =
  "YlGnBu")))(99))

for (i in 1:ncol(annotation))
{
    annotation[,i] = as.factor(findInterval(annotation[,i],c(10,20,30,40)))

}

ann_colors = list(
    INFA = c("4" = "#F2F2F2", "3" = "#E6E6E6", "2" = "dark gray", "1" = "black"),
    INFB = c("4" = "#F2F2F2", "3" = "#E6E6E6", "2" = "dark gray", "1" = "black"),
    INFC = c("4" = "#F2F2F2", "3" = "#E6E6E6", "2" = "dark gray", "1" = "black"),
    MPV = c("4" = "#F2F2F2", "3" = "#E6E6E6", "2" = "dark gray", "1" = "black"),
    RSV = c("4" = "#F2F2F2", "3" = "#E6E6E6", "2" = "dark gray", "1" = "black"),
    HRV = c("4" = "#F2F2F2", "3" = "#E6E6E6", "2" = "dark gray", "1" = "black"),
    PIV1 = c("4" = "#F2F2F2", "3" = "#E6E6E6", "2" = "dark gray", "1" = "black"),
    PIV2 = c("4" = "#F2F2F2", "3" = "#E6E6E6", "2" = "dark gray", "1" = "black"),
    PIV3 = c("4" = "#F2F2F2", "3" = "#E6E6E6", "2" = "dark gray", "1" = "black"),
    PIV4 = c("4" = "#F2F2F2", "3" = "#E6E6E6", "2" = "dark gray", "1" = "black"),
    ADV = c("4" = "#F2F2F2", "3" = "#E6E6E6", "2" = "dark gray", "1" = "black"),
    EVD68 = c("4" = "#F2F2F2", "3" = "#E6E6E6", "2" = "dark gray", "1" = "black")
)
pheatmap(tomap,
         annotation_row=annotation,
         annotation_colors = ann_colors,
         cluster_cols = F,
         show_rownames = F,
         col=cols
)

Box plots

viruses = c("INFA","INFB","INFC","MPV","RSV","HRV","PIV1","PIV2","PIV3","PIV4","ADV","EVD68")


ctthreshold = 35

annotation = metadata.Valid[,c(34,32,59,57,55,53,49,63,51,46,40,36)]
rownames(annotation) = metadata.Valid[,1]
colnames(annotation) = rev(colnames(tomap))

annotation[annotation >= ctthreshold] = NA

annotation[annotation < ctthreshold] = 1

annotation[is.na(annotation)] = 0

par(mfrow=c(4,3), mar=c(4, 4, 2.5, 1.5))

sens_scores = vector(length = 12) # for each virus
spec_scores = vector(length = 12) # for each virus

for (viralcount in 1:12)
{   
        virus = viruses[viralcount]

        values = unique(rev(sort(viral_reads[,virus])))
        TPR = vector(length = length(values)) 
        FPR = vector(length = length(values)) 
        for (i in 1:length(values))
        {   k = values[i] 
            matches = which(viral_reads[,virus] >= k)
            TPR[i] = length(intersect(matches,which(annotation[,virus] == 1))) / length(which(annotation[,virus] == 1))
            FPR[i] = length(intersect(matches,which(annotation[,virus] == 0))) / length(which(annotation[,virus] == 0)) 
        } 
        #plot(FPR,TPR,type="l",main=virus)

        boxplot(tomap[which(annotation[,virus] == 0),virus],tomap[which(annotation[,virus] == 1),virus],main=virus)

        matches = which(viral_reads[,virus] > 0)
        tpr0 = length(intersect(matches,which(annotation[,virus] == 1))) / length(which(annotation[,virus] == 1))
        fpr0 = length(intersect(matches,which(annotation[,virus] == 0))) / length(which(annotation[,virus] == 0)) 

        # prints out the sensitivity and the specificity
        #print(c(virus,tpr0 * 100,100-(fpr0 * 100)))

        sens_scores[viralcount] = as.numeric(tpr0 * 100)
        spec_scores[viralcount] = as.numeric(100 - (fpr0 * 100))
}

kable(cbind(virus = viruses,sens = sens_scores,spec = spec_scores))
virus sens spec
INFA 100 93.6585365853659
INFB 100 97.1563981042654
INFC 33.3333333333333 95.8139534883721
MPV 100 90.8653846153846
RSV 90 92.4170616113744
HRV 73.469387755102 77.2357723577236
PIV1 100 94.4954128440367
PIV2 100 98.6175115207373
PIV3 100 91.4691943127962
PIV4 90.9090909090909 91.4285714285714
ADV 44.4444444444444 96.551724137931
EVD68 100 90

Calculation of viral load and comparison with ct values

genomeSizes = c("INFA" = 13.63, "INFB" = 14.45, "INFC" = 12.9, "MPV" = 13.35, "RSV" = 15.222, "HRV" = 7.171333, "PIV1" = 15.6, "PIV2" = 15.646, "PIV3" = 15.462, "PIV4" = 17.052, "ADV" = 35.1435, "EVD68" = 7.367)

#check that names match
names(genomeSizes) == colnames(viral_reads)
##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
viral_reads_adjusted = viral_reads

for (i in 1:ncol(viral_reads_adjusted))
{
        viral_reads_adjusted[,i] = log((viral_reads[,i]/genomeSizes[i]) + 1, 10)
}

countsVsCT = 
rbind(

cbind(1,viral_reads_adjusted[,"INFA"],metadata.Valid[,"fluact"]),
cbind(2,viral_reads_adjusted[,"INFB"],metadata.Valid[,"flubct"]),
cbind(3,viral_reads_adjusted[,"INFC"],metadata.Valid[,"flucct"]),
cbind(4,viral_reads_adjusted[,"MPV"],metadata.Valid[,"MPV_Ct"]),
cbind(5,viral_reads_adjusted[,"RSV"],metadata.Valid[,"RSV_Ct"]),
cbind(6,viral_reads_adjusted[,"HRV"],metadata.Valid[,"HRV_Ct"]),
cbind(7,viral_reads_adjusted[,"PIV1"],metadata.Valid[,"PIV1.Ct"]),
cbind(8,viral_reads_adjusted[,"PIV2"],metadata.Valid[,"PIV2.Ct"]),
cbind(9,viral_reads_adjusted[,"PIV3"],metadata.Valid[,"PIV3.Ct"]),
cbind(10,viral_reads_adjusted[,"PIV4"],metadata.Valid[,"PIV4.Ct"]),
cbind(11,viral_reads_adjusted[,"ADV"],metadata.Valid[,"ADV_Ct"]),
cbind(12,viral_reads_adjusted[,"EVD68"],metadata.Valid[,"EV.D68_Ct"])
)


df = data.frame(x = 1/countsVsCT[,3], y = countsVsCT[,2], group = colnames(viral_reads)[countsVsCT[,1]])

p = ggplot(df, aes(x = x, y = y, color = group)) +
  geom_point() + geom_smooth(method=lm, se=FALSE,color="gray")

p

# Pearson correlation

cor.test(df$x,df$y,use = "complete")
## 
##  Pearson's product-moment correlation
## 
## data:  df$x and df$y
## t = 17.872, df = 252, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.6878744 0.7973643
## sample estimates:
##       cor 
## 0.7476573

Optional: exploring prediction accuracy across multiple CT thresholds

avg_sensitivity = vector(length = 45)
avg_specificity = vector(length = 45)


for (count in 1:45) #this is the ct threshold

{
    ctthreshold = count

    annotation = metadata.Valid[,c(34,32,59,57,55,53,49,63,51,46,40,36)]
    rownames(annotation) = metadata.Valid[,1]
    colnames(annotation) = rev(colnames(tomap))

    annotation[annotation >= ctthreshold] = NA

    annotation[annotation < ctthreshold] = 1

    annotation[is.na(annotation)] = 0

    par(mfrow=c(4,3))

    sens_scores = vector(length = 12) # for each virus
    spec_scores = vector(length = 12) # for each virus

    for (viralcount in 1:12)
    {   
        virus = viruses[viralcount]

        values = unique(rev(sort(viral_reads[,virus])))
        TPR = vector(length = length(values)) 
        FPR = vector(length = length(values)) 
        for (i in 1:length(values))
        {   k = values[i] 
            matches = which(viral_reads[,virus] >= k)
            TPR[i] = length(intersect(matches,which(annotation[,virus] == 1))) / length(which(annotation[,virus] == 1))
            FPR[i] = length(intersect(matches,which(annotation[,virus] == 0))) / length(which(annotation[,virus] == 0)) 
        } 
        plot(FPR,TPR,type="l",main=virus)

        matches = which(viral_reads[,virus] > 0)
        tpr0 = length(intersect(matches,which(annotation[,virus] == 1))) / length(which(annotation[,virus] == 1))
        fpr0 = length(intersect(matches,which(annotation[,virus] == 0))) / length(which(annotation[,virus] == 0)) 

        # prints out the sensitivity and the specificity
        print(c(virus,tpr0 * 100,100-(fpr0 * 100)))

        sens_scores[viralcount] = as.numeric(tpr0 * 100)
        spec_scores[viralcount] = as.numeric(100 - (fpr0 * 100))

    }

    print(paste(count,mean(sens_scores)))
    print(paste(count,mean(spec_scores)))

    avg_sensitivity[count] = mean(sens_scores)
    avg_specificity[count] = mean(spec_scores)

}

Calculation of total bacterial pathogen and viral abundance


Here, we will calculate the total relative abundance of the three bacterial sinusitis pathogens (HFLU, MCAT, SPN), as well as the total relative abundance of respiratory viruses. This is done by summing their log10(RPM) values.

viral_pathogens = c("Respiratory syncytial virus","Enterovirus D","Human coronavirus 229E","Human coronavirus HKU1","Betacoronavirus 1","Human coronavirus NL63","Human mastadenovirus B","Human mastadenovirus C","Human metapneumovirus","Human orthopneumovirus","Human orthorubulavirus 2","Human orthorubulavirus 4","Human respirovirus 1","Human respirovirus 3","Influenza A virus","Influenza B virus","Influenza C virus","Rhinovirus A","Rhinovirus B","Rhinovirus C")

panel_viruses = c("Respiratory syncytial virus","Enterovirus D","Human mastadenovirus B","Human mastadenovirus C","Human metapneumovirus","Human orthopneumovirus","Human orthorubulavirus 2","Human orthorubulavirus 4","Human respirovirus 1","Human respirovirus 3","Influenza A virus","Influenza B virus","Influenza C virus","Rhinovirus A","Rhinovirus B","Rhinovirus C")

three_bacteria = c("Haemophilus influenzae", "Moraxella catarrhalis", "Streptococcus pneumoniae")



bacPathAbundance = apply(matr[,three_bacteria],1,sum)  # sum of their individual RPMs

viralPathAbundance = apply(matr[,viral_pathogens],1,sum)   # sum of their individual RPMs


metadata = metadata %>% mutate(threebac_quantile = ntile(bacPathAbundance, 10), viral_quantile = ntile(viralPathAbundance, 10))


metadata$high_bac<- ifelse(metadata$threebac_quantile >=6, 1, 0)
metadata$high_viral<- ifelse(metadata$viral_quantile >=6, 1, 0)

metadata$high_pathogens <- ifelse(metadata$threebac_quantile >=6 & metadata$viral_quantile >=6,3, 
                           ifelse(metadata$threebac_quantile >=6, 2,
                           ifelse(metadata$viral_quantile >=6, 1, 0)))

Exploratory analysis of newly identified pathogens


To explore, we will first subset the data to all viruses (species with “virus” in their name) with greater than log10(rpm) +1 > 0.3 (roughly 1 read per million), as well as any species with log10+1 abundance greater than 3 since this threshold includes our three bacterial sinusitis pathogens (SPN, HFU, MCAT).

matr.subset = matr.Valid[,unique(c(which(apply(log(matr.Valid + 1,10),2,max) > 3),intersect(grep("virus",colnames(matr.Valid)),which(apply(log(matr.Valid + 1,10),2,max) > 0.3))))]

pathTable = rev(sort(apply(log(matr.subset + 1,10),2,max)))
cols = vector(length = length(pathTable))
cols[] = "red"
cols[which(names(pathTable) %in% c(three_bacteria,panel_viruses))] = "black"

par(mfrow=c(2,1))
barplot(pathTable,col=cols,las=3)

pheatmap(t(log(matr.Valid[,names(pathTable)] + 1,10)),cluster_rows=F,fontsize_row=5)

The above list of species was then investigated further using BLAST and literature searches to identify a smaller subset of pathogens of interest.

pathogensOfInterest = c("Influenza A virus", "Influenza C virus", "Haemophilus influenzae", "Human coronavirus NL63", "Influenza B virus", "Betacoronavirus 1", "Respiratory syncytial virus", "Human coronavirus HKU1", "Human respirovirus 3", "Human orthopneumovirus", "Moraxella nonliquefaciens", "Fusobacterium nucleatum", "Human mastadenovirus C", "Human metapneumovirus", "Moraxella catarrhalis", "Prevotella intermedia", "Metamycoplasma salivarium", "Streptococcus pneumoniae", "Pasteurella multocida", "Human orthorubulavirus 4", "Tannerella forsythia", "Corynebacterium pseudodiphtheriticum", "Moraxella osloensis", "Enterovirus B", "Mycoplasma pneumoniae", "Enterovirus A", "Human orthorubulavirus 2", "Chlamydia pneumoniae", "Treponema medium", "Human coronavirus 229E", "Rhinovirus C", "Enterovirus D", "Rhinovirus A", "Human respirovirus 1", "Murine leukemia virus", "Rhinovirus B", "Human mastadenovirus B", "Human gammaherpesvirus 4", "Human betaherpesvirus 5", "Mamastrovirus 9", "Cardiovirus B", "Betapolyomavirus quartihominis", "Parechovirus A")


matr.subset = matr[,pathogensOfInterest]

pathTable = rev(sort(apply(log(matr.subset + 1,10),2,max)))
cols = vector(length = length(pathTable))
cols[] = "red"
cols[which(names(pathTable) %in% c(three_bacteria,panel_viruses))] = "black"
par(mfrow=c(2,1))
barplot(pathTable,col=cols,las=3,cex.names=0.5)

#plot the pathogen-positive samples (basedon culture/PCR)
breaksList = seq(0, 5, by = 1)
cols = c("white",colorRampPalette(brewer.pal(n = 4, name="YlGnBu"))(4))

temp.matr = t(log(matr[which(apply(metadata[,c(27,64)],1,sum) != 0),names(pathTable)] + 1,10))

temp.matr[temp.matr > 5] = 5 # adds ceiling to data to make color range consistent with next plot

pheatmap(temp.matr,cluster_rows=F,fontsize_row=5,fontsize_col=3,zlim=c(0,5),color =rev(cols), breaks=breaksList)

#plot the pathogen-negative samples (basedon culture/PCR)
pheatmap(t(log(matr[which(apply(metadata[,c(27,64)],1,sum) == 0),names(pathTable)] + 1,10)),cluster_rows=F,fontsize_row=5,fontsize_col=3,zlim=c(0,5),color = rev(cols),breaks=breaksList)

2. Gene Expression Analyses


Loading data and formatting metadata

genesymbols = read.delim("host-expression/gencode.v39.metadata.HGNC")

metadata = metadata[which(metadata$batch != 1),] #removing samples from batch 1
metadata[,"batch"] = as.factor(metadata[,"batch"])

metadata[,"bv_both_none"] = as.factor(metadata[,"bv_both_none"])

metadata[which(as.numeric(metadata[,"cum_pathogn"]) > 1),"cum_pathogn"] = 1

metadata[,"cum_pathogn"] = as.factor(metadata[,"cum_pathogn"])

metadata[,"sex"] = as.factor(metadata[,"sex"])

metadata$age_scaled = scale(metadata$Age.in.years)

DESEQ2 pairwise comparison of bacteria-only vs virus-only group

onlyvirusonlybacteria_samples = metadata[which((metadata$bv_both_none == 1) | (metadata$bv_both_none == 2)),]

onlyvirusonlybacteria_samples = onlyvirusonlybacteria_samples %>% arrange(cum_pathogn)

subsetfiles <- file.path("host-expression/quants", onlyvirusonlybacteria_samples$Filename, "quant.sf")

subset_txi.salmon <- tximport(subsetfiles, type = "salmon", tx2gene = genesymbols)

dds <- DESeqDataSetFromTximport(subset_txi.salmon, onlyvirusonlybacteria_samples, ~ batch + sex + age_scaled + cum_pathogn)

dds <- DESeq(dds)

res <- results(dds)

summary(res)
## 
## out of 31126 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 2944, 9.5%
## LFC < 0 (down)     : 1891, 6.1%
## outliers [1]       : 0, 0%
## low counts [2]     : 8333, 27%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
pairwise_genes = which(res$padj < 0.001)

bac_upDEGs = intersect(which(res$padj < 0.001),which(res$log2FoldChange > 0))

viral_upDEGs = intersect(which(res$padj < 0.001),which(res$log2FoldChange < 0))

Number of DEGs detected:

There are 548 bacterial upDEGs and 273 viral upDEGs.

Volcano Plot

EnhancedVolcano(res,
  lab = rownames(res),
  x = 'log2FoldChange',
  y = 'pvalue',
  pCutoff = 0.001,
  FCcutoff = 0,
  xlim = c(-7, 7),
    ylim=c(-1,27),
  pointSize = 1.5,
  labSize = 2.5,
  title = 'DESeq2 results',
  subtitle = 'Differential expression',
  caption = 'p-value cutoff, 0.001',
  legendPosition = "none",
  legendLabSize = 14,
  col = c('grey30', 'light gray', 'royalblue', 'red2'),
  colAlpha = 0.9,
  drawConnectors = FALSE,
  widthConnectors = 0.5)

Host-response / pathogen abundance correlations

Read in full set of files, including those with NO pathogens and BOTH (virus and bacteria)

files <- file.path("host-expression/quants", metadata$Filename, "quant.sf")
txi.salmon <- tximport(files, type = "salmon", tx2gene = genesymbols)
pdds <- DESeqDataSetFromTximport(txi.salmon, metadata, ~ batch)
prld <- varianceStabilizingTransformation(pdds, blind = TRUE)

Jitter plots of expression levels for selected genes (previous biomarkers)

These are plots of host gene expression per clinical group

pathOrNot = as.numeric(metadata[,"bv_both_none"]) # using clinical categories

genelist = c("CXCL10", "PRF1", "IFI27", "CCL8", "PTGS2", "S100A9", "PLAUR", "TNFAIP3","IL1B")
for (i in 1:length(genelist)) {
  gene = genelist[i] 
  print(gene)
  p = ggplot(data.frame(cbind(assay(prld)[gene,],pathOrNot)), aes(y = V1, x = as.factor(pathOrNot),fill=as.factor(pathOrNot))) +
  geom_boxplot(outlier.shape=NA) +
  ggtitle(gene) +
 theme(axis.title = element_blank(), legend.position="none") + 
    labs(y = "RNA-seq abundance\nlog10(rpm)", x = "Culture test") +
  geom_point(pch = 21, position = position_jitterdodge()) +
  scale_fill_discrete(labels=c("None detected", "Bacterial", "Viral", "Both")) +
  labs(fill = "Infection type")
  assign(paste0("p", i), p)
}
## [1] "CXCL10"
## [1] "PRF1"
## [1] "IFI27"
## [1] "CCL8"
## [1] "PTGS2"
## [1] "S100A9"
## [1] "PLAUR"
## [1] "TNFAIP3"
## [1] "IL1B"
p1 + p2 + p3 + p4 + p5 + p6 + p7 + p8 + p9 + theme(legend.position="right") + plot_layout(nrow = 3)

Setting up data for sorted heatmaps

metadata.Valid = metadata[which(metadata[,"batch"] != 6),]

#setting up a temp matrix with Z-score scaled expression levels
temp = t(scale(t(assay(prld[,which(metadata[,"batch"] != 6)])))) 
temp[temp < -4] = -4
temp[temp > +4] = +4
temp[is.na(temp)] = 0

annotation_col = data.frame(
    Category = metadata.Valid$high_pathogens, 
            Viral_pathogen_abundance = metadata.Valid[,"viral_quantile"],
     Bacterial_pathogen_abundance = metadata.Valid[,"threebac_quantile"],
     Symptom_severity_score = metadata.Valid[,"PRSS"],
     Days_with_cold = metadata.Valid[,"days_with_cold"],
     Infection = factor(metadata.Valid[,"bv_both_none"], labels = c("None","Bacterial", "Viral", "Both"))
     )

Viral sorted heatmap

heatmap <- pheatmap(temp[viral_upDEGs,order(viralPathAbundance[-which(metadata$batch ==6)])], 

    annotation_colors = list(Infection = c(None = "white", Bacterial = "light blue", Viral = "gold", Both = "black"),

    Resistance = c("No" = "white", "Yes" = "black")),
    annotation_col = annotation_col,
    show_colnames = F,
    show_rownames = F,
    border_color=NA,
    col = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(55),
    #col = colorRampPalette(c("navy", "white", "red"))(50),
    fontsize_row=6,
    cluster_cols = F,
    cluster_rows = T,
    annotation_legend = TRUE)

Bacterial sorted heatmap

heatmap <- pheatmap(temp[bac_upDEGs,order(bacPathAbundance[-which(metadata$batch ==6)])], 

annotation_colors = list(Infection = c(None = "white", Bacterial = "light blue", Viral = "gold", Both = "black"),
    Resistance = c("No" = "green", "Yes" = "red")),
    annotation_col = annotation_col,
    show_colnames = F,
    show_rownames = F,
    border_color=NA,
    col = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(55),
    #col = colorRampPalette(c("navy", "white", "red"))(50),
    fontsize_row=6,
    cluster_cols = F,
    cluster_rows = T,
    annotation_legend = TRUE)

Combined heatmap

heatmap1 <- pheatmap(temp[viral_upDEGs,order(metadata.Valid$high_pathogen)], 

    annotation_colors = list(Infection = c(None = "white", Bacterial = "light blue", Viral = "gold", Both = "black"),
    Resistance = c("No" = "white", "Yes" = "black")),
    annotation_col = annotation_col,
    show_colnames = F,
    show_rownames = F,
    border_color=NA,
    col = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(55),
    #col = colorRampPalette(c("navy", "white", "red"))(50),
    fontsize_row=6,
    cluster_cols = F,
    cluster_rows = T,
    annotation_legend = TRUE)

heatmap2 <- pheatmap(temp[bac_upDEGs,order(metadata.Valid$high_pathogen)], 

    annotation_colors = list(Infection = c(None = "white", Bacterial = "light blue", Viral = "gold", Both = "black"),
    Resistance = c("No" = "white", "Yes" = "black")),
    annotation_col = annotation_col,
    show_colnames = F,
    show_rownames = F,
    border_color=NA,
    col = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(55),
    #col = colorRampPalette(c("navy", "white", "red"))(50),
    fontsize_row=6,
    cluster_cols = F,
    cluster_rows = T,
    annotation_legend = TRUE)

Jitter plots of host response scores vs pathogen abundance

temp = t(scale(t(assay(prld[,]))))

viralResponseScore = apply(temp[viral_upDEGs,],2,mean)

bacterialResponseScore = apply(temp[bac_upDEGs,],2,mean)


group = as.numeric(metadata.Valid[,"threebac_quantile"])

  p = ggplot(data.frame(cbind(bacterialResponseScore,group, row.names=NULL)), aes(y = bacterialResponseScore, x = as.factor(group),fill=as.factor(group))) +
  geom_boxplot(outlier.shape=NA) +
  geom_point(pch = 21, position = position_jitterdodge()) +
  labs(fill = "Pathogen abundance") +
  labs(x = "Bacterial pathogen abundance (decile)", y = "Bacterial response score\n(Mean expression Z-score of bacterial upDEGs)") +
    theme(legend.position = "none")
  
  p

group = as.numeric(metadata[,"viral_quantile"])

  p = ggplot(data.frame(cbind(viralResponseScore,group, row.names=NULL)), aes(y = viralResponseScore, x = as.factor(group),fill=as.factor(group))) +
  geom_boxplot(outlier.shape=NA) +
  geom_point(pch = 21, position = position_jitterdodge()) +
  labs(fill = "Pathogen abundance") +
  labs(x = "Viral pathogen abundance (decile)", y = "Viral response score\n(Mean expression Z-score of viral upDEGs)") +
    theme(legend.position = "none")
  p

group = as.numeric(metadata[,"high_pathogens"])

  p = ggplot(data.frame(cbind(bacterialResponseScore,group, row.names=NULL)), aes(y = bacterialResponseScore, x = as.factor(group),fill=as.factor(group))) +
  geom_boxplot(outlier.shape=NA) +
  geom_point(pch = 21, position = position_jitterdodge()) +
  scale_fill_discrete(labels=c("Both low", "Viral high", "Bacterial high", "Both high")) +
  labs(fill = "Pathogen abundance") +
  labs(x = "Pathogen abundance (decile)", y = "Bacterial response score\n(Mean expression Z-score of bacterial upDEGs)") 
  p

group = as.numeric(metadata[,"high_pathogens"])

  p = ggplot(data.frame(cbind(viralResponseScore,group, row.names=NULL)), aes(y = viralResponseScore, x = as.factor(group),fill=as.factor(group))) +
  geom_boxplot(outlier.shape=NA) +
  geom_point(pch = 21, position = position_jitterdodge()) +
  scale_fill_discrete(labels=c("Both low", "Viral high", "Bacterial high", "Both high")) +
  labs(fill = "Pathogen abundance") +
  labs(x = "Pathogen abundance (decile)", y = "Viral response score\n(Mean expression Z-score of viral upDEGs)")
  
  
  p

  ### Now examining against the clinical categories
  
  
group = as.numeric(metadata$bv_both_none)

  p = ggplot(data.frame(cbind(bacterialResponseScore,group, row.names=NULL)), aes(y = bacterialResponseScore, x = as.factor(group),fill=as.factor(group))) +
  geom_boxplot(outlier.shape=NA) +
  geom_point(pch = 21, position = position_jitterdodge()) +
  scale_fill_discrete(labels=c("Both low", "Viral high", "Bacterial high", "Both high")) +
  labs(fill = "Pathogen abundance") +
  labs(x = "Pathogen abundance (decile)", y = "Bacterial response score\n(Mean expression Z-score of bacterial upDEGs)") 
  p

group = as.numeric(metadata$bv_both_none)

  p = ggplot(data.frame(cbind(viralResponseScore,group, row.names=NULL)), aes(y = viralResponseScore, x = as.factor(group),fill=as.factor(group))) +
  geom_boxplot(outlier.shape=NA) +
  geom_point(pch = 21, position = position_jitterdodge()) +
  scale_fill_discrete(labels=c("Both low", "Viral high", "Bacterial high", "Both high")) +
  labs(fill = "Pathogen abundance") +
  labs(x = "Pathogen abundance (decile)", y = "Viral response score\n(Mean expression Z-score of viral upDEGs)")
  
  
  p

Individual gene expression levels across groups

pathOrNot = as.numeric(metadata[,"high_pathogens"]) # using categories "Both low" "Viral high" "Bacterial high" "Both high"

genelist = c("PTGS2", "S100A9", "PLAUR", "TNFAIP3", "SERPINA2", "CXCL2","IL1B", "CXCL11","IFNL1","CXCL10", "PRF1", "IFI27", "CCL8","OAS2")
for (i in 1:length(genelist)) {
  gene = genelist[i] 
  p = ggplot(data.frame(cbind(assay(prld)[gene,],pathOrNot)), aes(y = V1, x = as.factor(pathOrNot),fill=as.factor(pathOrNot))) +
  geom_boxplot(outlier.shape=NA) +
  ggtitle(gene) +
  theme(axis.title = element_blank(), legend.position="none") +
  geom_point(pch = 21, position = position_jitterdodge()) +
  scale_fill_discrete(labels=c("None detected", "Viral", "Bacterial", "Both")) +
  labs(fill = "Infection type")
  assign(paste0("p", i), p)
}
p1 + p2 + p3 + p4 + p5 + p6 + p7 + p8 + p9 + p10 + p11 + p12 + p13 + p14 + plot_layout(nrow = 2)

Testing the specificity of the bacterial host response for SPN/HFLU/MCAT

Here, we will compute the correlation between the bacterial host response score and the total bacterial pathogen abundance (MCAT + SPN + HFLU), and compare this to 1000 randomly samples selections of three bacterial species from the data.

abundant_bacteria = setdiff(which(apply(matr,2,max) >= 10),grep("virus",colnames(matr)))
cors = vector(length = 1000)
for (i in 1:1000)
{

        randSetOfThree = sample(abundant_bacteria,3)
        threeBacAbundance = apply(matr[,randSetOfThree],1,sum)  # sum of their individual RPMs
        #ntile(threeBacAbundance, 10)
        cors[i] = cor(bacterialResponseScore,log(threeBacAbundance + 1,10))
}

hist(cors,xlim=c(-1,1),breaks=100)
abline(v = cor(bacterialResponseScore, log(bacPathAbundance + 1,10)))

ROC curves for predicting high/low pathogen abundance based on host response

Viral ROC

response = vector(length = nrow(metadata.Valid))
response[] = 0
response[which(metadata.Valid$high_pathogen == 1 | metadata.Valid$high_pathogen == 3)] = 1

prediction = viralResponseScore[which(metadata$batch !=6)]
bacPlot = plot.roc(roc(response,prediction))

#coords(bacPlot) #for a list of sensitivities and specificities

auc(response,prediction)
## [1] 0.7963905

Bacterial ROC

response = vector(length = nrow(metadata.Valid))
response[] = 0
response[which(metadata.Valid$high_pathogen == 2 | metadata.Valid$high_pathogen == 3)] = 1

prediction = bacterialResponseScore[which(metadata$batch !=6)]
bacPlot = plot.roc(roc(response,prediction))

#coords(bacPlot) #for a list of sensitivities and specificities

auc(response,prediction)
## [1] 0.7786522

Clinical - Viral ROC

response = vector(length = nrow(metadata.Valid))
response[] = 0
response[which(metadata.Valid$bv_both_none == 2 | metadata.Valid$bv_both_none == 3)] = 1

prediction = viralResponseScore[which(metadata$batch !=6)]
bacPlot = plot.roc(roc(response,prediction))

#coords(bacPlot) #for a list of sensitivities and specificities

auc(response,prediction)
## [1] 0.7678653

Clinical - Bacterial ROC

response = vector(length = nrow(metadata.Valid))
response[] = 0
response[which(metadata.Valid$bv_both_none == 1 | metadata.Valid$bv_both_none == 3)] = 1

prediction = bacterialResponseScore[which(metadata$batch !=6)]
bacPlot = plot.roc(roc(response,prediction))

#coords(bacPlot) #for a list of sensitivities and specificities

auc(response,prediction)
## [1] 0.7802339

Cell type deconvolution analysis using xCell

# Load the xCell data
xCell = read.delim("../xcell_results/xCell_tpm_matrix_xCell_0050111924_RAW.txt", sep = '\t', row.names = 1)
xCell_signif = read.delim("../xcell_results/xCell_tpm_matrix_xCell_0050111924.pvals.txt", sep = '\t', row.names = 1)

colnames(xCell) = 1:221
colnames(xCell_signif) = 1:221

# Mask non-significant values
xCell_filtered = xCell
xCell_filtered[xCell_signif >= 0.2] <- NA  # Set non-significant values to NA

# Replace NA with a low value (e.g., 0)
xCell_filtered_noNA <- xCell_filtered
xCell_filtered_noNA[is.na(xCell_filtered_noNA)] <- 0

#remove rows with no variance

toRemove = which(apply(xCell_filtered_noNA,1,var) == 0)

xCell_filtered_noNA <- xCell_filtered_noNA[-toRemove,]

breaksList = seq(0, 6000, by = 100)
cols = rev(c(colorRampPalette(brewer.pal(n = 4, name="YlGnBu"))(4)))

# Plot the heatmap
pheatmap(xCell_filtered_noNA, scale = "none", na_col = "white", main = "xCell Heatmap (Significant Values Only)", annotation_col = annotation_col,col=cols)

pvals = vector(length = nrow(xCell_filtered_noNA))

for (i in 1:nrow(xCell_filtered_noNA))
{       pvals[i] = kruskal.test(as.numeric(xCell_filtered_noNA[i,]) ~ as.factor(metadata.Valid$high_pathogens))$p.value

}


pheatmap(xCell_filtered_noNA[which(p.adjust((pvals)) < 0.1),order(annotation_col$Category)],  annotation_col = annotation_col,cluster_cols=F,cluster_rows=F)

xCell_filtered_noNA <- xCell_filtered_noNA[which(p.adjust((pvals)) < 0.1),]

Testing the ability of xCell-derived cell quantification to classify samples

Ability to predict RNA-seq based classification

# bacteria

xCell_Bac <- matrix(ncol = 1, nrow = nrow(xCell_filtered_noNA))
rownames(xCell_Bac) = rownames(xCell_filtered_noNA)
for (i in 1:nrow(xCell_filtered_noNA))
{
    response = vector(length = nrow(metadata.Valid))
response[] = 0
response[which(metadata.Valid$high_pathogen == 2 | metadata.Valid$high_pathogen == 3)] = 1

prediction = as.numeric(xCell_filtered_noNA[i,])
#plot.roc(roc(response,prediction))

xCell_Bac[i,1] = auc(response,prediction)

}

barplot((t(xCell_Bac[order(xCell_Bac),])),las=3,ylim=c(0,1),main="xCell prediction of high_bacterial category from RNA-seq")

print(paste("Top score is:",max(xCell_Bac[,1])))
## [1] "Top score is: 0.757501229709789"
# viruses

xCell_Vir <- matrix(ncol = 1, nrow = nrow(xCell_filtered_noNA))
rownames(xCell_Vir) = rownames(xCell_filtered_noNA)

for (i in 1:nrow(xCell_filtered_noNA))
{
    response = vector(length = nrow(metadata.Valid))
response[] = 0
response[which(metadata.Valid$high_pathogen == 1 | metadata.Valid$high_pathogen == 3)] = 1

prediction = as.numeric(xCell_filtered_noNA[i,])
#plot.roc(roc(response,prediction))
xCell_Vir[i,1] = auc(response,prediction)

}

barplot((t(xCell_Vir[order(xCell_Vir),])),las=3,ylim=c(0,1),main="xCell prediction of high_viral category from RNA-seq")

print(paste("Top score is:",max(xCell_Vir[,1])))
## [1] "Top score is: 0.839704675963905"

Ability to predict clinical classification

# bacteria

xCell_Bac <- matrix(ncol = 1, nrow = nrow(xCell_filtered_noNA))
rownames(xCell_Bac) = rownames(xCell_filtered_noNA)
for (i in 1:nrow(xCell_filtered_noNA))
{
    response = vector(length = nrow(metadata.Valid))
response[] = 0
response[which(metadata.Valid$bv_both_none == 1 | metadata.Valid$bv_both_none == 3)] = 1

prediction = as.numeric(xCell_filtered_noNA[i,])
#plot.roc(roc(response,prediction))

xCell_Bac[i,1] = auc(response,prediction)

}

barplot((t(xCell_Bac[order(xCell_Bac),])),las=3,ylim=c(0,1),main="xCell prediction of bacterial category from clinical culture")

print(paste("Top score is:",max(xCell_Bac[,1])))
## [1] "Top score is: 0.757426900584795"
# viruses

xCell_Vir <- matrix(ncol = 1, nrow = nrow(xCell_filtered_noNA))
rownames(xCell_Vir) = rownames(xCell_filtered_noNA)

for (i in 1:nrow(xCell_filtered_noNA))
{
    response = vector(length = nrow(metadata.Valid))
response[] = 0
response[which(metadata.Valid$bv_both_none == 2 | metadata.Valid$high_pathogen == 3)] = 1

prediction = as.numeric(xCell_filtered_noNA[i,])
#plot.roc(roc(response,prediction))
xCell_Vir[i,1] = auc(response,prediction)

}

barplot((t(xCell_Vir[order(xCell_Vir),])),las=3,ylim=c(0,1),main="xCell prediction of viral category from clinical culture")

print(paste("Top score is:",max(xCell_Vir[,1])))
## [1] "Top score is: 0.752968617472434"

Analysis of potential batch effects

Ruling out date effect on bacterial pathogen abundance

x = as.Date(metadata$Date.NP.swab.collected,format="%d-%b-%y")
y = bacPathAbundance

data <- data.frame(
  x_numeric = as.numeric(x),        # Convert x to numeric
  y_log = log(y + 1, 10)            # Apply log transformation to y
)

# Plot using ggplot2
ggplot(data, aes(x = x_numeric, y = y_log)) +
  geom_point(color = "blue", size = 3) +   # Scatter points
  geom_smooth(method = "lm", color = "red", se = TRUE) +  # Add trendline with confidence interval
  labs(
    title = "Scatter Plot with Trendline",
    x = "Numeric X (e.g., days since 1970-01-01)",
    y = "Log-transformed Y (log10(y + 1))"
  ) +
  theme_minimal()

cor(as.numeric(x),log(y + 1,10))
## [1] NA
cor.test(as.numeric(x),log(y + 1,10))
## 
##  Pearson's product-moment correlation
## 
## data:  as.numeric(x) and log(y + 1, 10)
## t = 0.35519, df = 219, p-value = 0.7228
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1083199  0.1554733
## sample estimates:
##        cor 
## 0.02399436

Ruling out effect on viral pathogen abundance

x = as.Date(metadata$Date.NP.swab.collected,format="%d-%b-%y")
y = viralPathAbundance
data <- data.frame(
  x_numeric = as.numeric(x),        # Convert x to numeric
  y_log = log(y + 1, 10)            # Apply log transformation to y
)

# Plot using ggplot2
ggplot(data, aes(x = x_numeric, y = y_log)) +
  geom_point(color = "blue", size = 3) +   # Scatter points
  geom_smooth(method = "lm", color = "red", se = TRUE) +  # Add trendline with confidence interval
  labs(
    title = "Scatter Plot with Trendline",
    x = "Numeric X (e.g., days since 1970-01-01)",
    y = "Log-transformed Y (log10(y + 1))"
  ) +
  theme_minimal()

cor(as.numeric(x),log(y + 1,10))
## [1] NA
cor.test(as.numeric(x),log(y + 1,10))
## 
##  Pearson's product-moment correlation
## 
## data:  as.numeric(x) and log(y + 1, 10)
## t = 0.8874, df = 219, p-value = 0.3758
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.07268811  0.19032512
## sample estimates:
##        cor 
## 0.05985734

Ruling out batch effect on bacterial abundance

# Prepare the data for ggplot2
data <- data.frame(
  log_bacPathAbundance = log(bacPathAbundance + 1, 10),
  batch = metadata$batch
)

# Create the boxplot using ggplot2
ggplot(data, aes(x = batch, y = log_bacPathAbundance)) +
  geom_boxplot(outlier.color = "red", outlier.size = 2) +  # Boxplot with outlier styling
  labs(
    x = NULL,  # No x-axis label
    y = "log10(Pathogen Abundance)"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1)  # Adjust x-axis text angle if needed
  )

Ruling out batch effect on viral pathogen abundance

# Prepare the data for ggplot2
data <- data.frame(
  log_viralPathAbundance = log(viralPathAbundance + 1, 10),
  batch = metadata$batch
)

# Create the boxplot using ggplot2
ggplot(data, aes(x = batch, y = log_viralPathAbundance)) +
  geom_boxplot(outlier.color = "red", outlier.size = 2) +  # Boxplot with outlier styling
  labs(
    x = NULL,  # No x-axis label
    y = "log10(Pathogen Abundance)"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1)  # Adjust x-axis text angle if needed
  )

PCA plots

vst_matrix <- assay(prld)  # Extract matrix from variance-stabilized object

# Perform PCA
pca <- prcomp(t(vst_matrix))  # Transpose required: genes as columns, samples as rows

# Prepare data for ggplot
pca_data <- data.frame(
  PC1 = pca$x[, 1],   # First principal component
  PC2 = pca$x[, 2],   # Second principal component
  Batch = metadata$batch  # Add batch information for coloring
)

# Create PCA plot
ggplot(pca_data, aes(x = PC1, y = PC2, color = Batch)) +
  geom_point(size = 3) +  # Plot points
  labs(
    title = "PCA Plot of Variance-Stabilized Data",
    x = "PC1",
    y = "PC2",
    color = "Batch"
  ) +
  theme_minimal() +
  theme(
    legend.position = "right",
    axis.title = element_text(size = 14),
    plot.title = element_text(size = 16, face = "bold")
  )